DET File Structure

The provided training data include image files in raw 16-bit format and the detection lists.

Each detection list is associated with a set of images and contains information for the detected objects, in space-delimited format with 16 columns, each row representing a single detection in one of the 4 original FITS images.

The columns are:

  • Unique ID -- An identifier for what detected object a row belongs to
  • Detection Number -- sequential numbering of detection output of the currently used detection software
  • Frame Number -- which observation is this row relevant to (1, 2, 3 or 4)
  • Sexnum -- Source extractor number of the object
  • Time -- Julian date
  • RA -- right ascension of object in decimal hours
  • DEC -- declination in decimal degrees
  • X -- location in pixels of the object in the original FITS image
  • Y -- location in pixels of the object in the original FITS image
  • Magnitude -- brightness of the object in magnitudes
  • FWHM -- full width at half maximum of Gaussian fit in pixels
  • Elong -- ratio of long axis to short axis
  • Theta -- position angle of the long axis
  • RMSE -- error in fit to straight line
  • Deltamu -- from Source Extractor, peak value minus threshold over background
  • Rejected -- this value will be 1 if the operator rejected the detection, 0 otherwise. This column will only be available during the training phase. You need to predict this column

TRAINING

1. Training with original columns


In [11]:
import pandas as pd

def onetofour(x):
    """Spread the first four elements of a sequence into a 4-tuple.

    Each grouped column holds one value per frame (four frames per
    detection id), so this turns a per-group list into four scalars.
    Raises IndexError if the sequence has fewer than four elements.
    """
    first, second, third, fourth = x[0], x[1], x[2], x[3]
    return first, second, third, fourth

# Column names for a .det file (one row per detection, 4 rows per object id).
# NOTE(review): the file-format description above lists 16 columns including
# "Deltamu", but only 15 names are given here (and "FWHE" looks like a typo
# for "FWHM") — verify against an actual .det file.
names = ["id","det_num", "frame_num", "sex_num", "time", "RA", "DEC", "X", "Y", "mag", "FWHE", "Elong", "theta", "RMSE", "rejected"]
# Hard-coded absolute Windows path to a single sample .det file.
df = pd.read_csv("E:/Dai Hoc/Deep Learning/Near Object/hackspace-2016/data/det_files/01_12DEC03_N01014.det", header=None, names=names, delim_whitespace=True)

# Collapse the 4 per-frame rows of each object into one row of lists.
dfg = df.groupby('id',as_index=False).agg(lambda x: x.tolist())

# Per-frame measurement columns to spread out into <name>0..<name>3.
new_names = ["RA", "DEC", "X", "Y", "mag", "FWHE", "Elong", "theta", "RMSE", "rejected"]
new_df = pd.DataFrame()

new_df['id'] = dfg['id']

# Spread each 4-element list into four scalar columns, one per frame.
# Assumes every id has exactly 4 detections (onetofour raises otherwise).
for nn in new_names:
    new_df[nn + '0'],new_df[nn + '1'],new_df[nn + '2'],new_df[nn + '3'] = zip(*dfg[nn].map(onetofour))

# new_df['rejected'] = zip(*dfg['rejected'].map(onebyone))
# The rejected flag is per-object, so the four copies are redundant;
# keep only rejected0 as the label.
new_df.drop(['rejected1','rejected2','rejected3'], axis=1, inplace=True)

new_df


Out[11]:
id RA0 RA1 RA2 RA3 DEC0 DEC1 DEC2 DEC3 X0 ... Elong3 theta0 theta1 theta2 theta3 RMSE0 RMSE1 RMSE2 RMSE3 rejected0
0 9 1.10287 1.09605 1.08769 1.08077 1231.906 1216.889 1184.891 1206.527 1599.445 ... 40.8 1.15 1.15 1.15 1.15 0.75 0.63 0.85 0.55 1
1 14 2.64320 2.64439 2.64506 2.64546 1588.978 1584.985 1567.276 1600.578 3816.586 ... -51.1 1.03 1.03 1.03 1.03 0.94 0.73 0.71 1.17 0
2 15 0.32087 0.33389 0.34766 0.36181 1606.368 1601.588 1580.328 1613.180 475.475 ... 45.4 1.63 1.63 1.63 1.63 0.45 0.77 0.92 0.39 1
3 16 0.80880 0.80142 0.79528 0.78851 1682.412 1668.442 1640.725 1662.952 1177.924 ... 90.0 1.66 1.66 1.66 1.66 0.51 0.66 0.49 0.42 1
4 25 1.40177 1.40288 1.40415 1.40520 2359.459 2355.695 2338.046 2371.710 2033.481 ... -15.0 0.71 0.71 0.71 0.71 0.86 0.87 1.28 1.26 0
5 26 1.57154 1.56891 1.56676 1.56490 2441.502 2439.360 2422.811 2457.871 2278.006 ... -74.6 0.84 0.84 0.84 0.84 0.85 0.87 0.82 0.90 0
6 36 2.69883 2.70075 2.70079 2.70141 3290.256 3279.316 3252.771 3277.572 3903.198 ... 70.9 1.65 1.65 1.65 1.65 0.43 0.70 0.72 0.67 1
7 37 0.60061 0.60100 0.60131 0.60188 3610.243 3605.142 3583.382 3614.265 885.432 ... 0.0 0.77 0.77 0.77 0.77 0.43 0.47 0.89 0.15 1
8 50 2.79557 2.79155 2.78740 2.78329 205.287 237.773 257.854 326.350 4031.637 ... 0.0 0.76 0.76 0.76 0.76 1.08 0.00 1.12 1.15 1
9 58 2.29163 2.23216 2.16983 2.11012 3354.141 3337.433 3305.431 3324.947 3317.545 ... 8.3 0.61 0.61 0.61 0.61 1.00 0.00 1.18 1.15 1
10 62 1.07723 1.07181 1.06610 1.06070 3506.649 3505.657 3489.361 3525.458 1570.936 ... 88.9 0.51 0.51 0.51 0.51 1.00 0.00 1.19 1.73 1

11 rows × 38 columns


In [4]:
import glob
import os
import pandas as pd

from IPython.display import display, HTML

# Hard-coded absolute Windows paths: input .det files and the flattened
# per-object training files written out by this cell.
DIR = "E:/Dai Hoc/Deep Learning/Near Object/hackspace-2016/data/det_files/"
OUT_DIR = "E:/Dai Hoc/Deep Learning/Near Object/hackspace-2016/data/det_training_files/"

names = glob.glob(DIR + "*.det")

# NOTE(review): same 15-name header as the single-file cell above — the
# format description mentions 16 columns (incl. Deltamu); verify.
header = ["id", "det_num", "frame_num", "sex_num", "time", "RA", "DEC", "X", "Y", "mag", "FWHE", "Elong", "theta", "RMSE", "rejected"]

new_names = ["RA", "DEC", "X", "Y", "mag", "FWHE", "Elong", "theta", "RMSE", "rejected"]

# Accumulates one flattened DataFrame per input file; concatenated by the
# training cells below.
training_input = []

if not os.path.exists(OUT_DIR):
    os.makedirs(OUT_DIR)

# TODO(review): this loop body duplicates the single-file flattening logic
# of the cell above and relies on onetofour() defined there — consider
# extracting a shared helper function.
for name in names:

    df = pd.read_csv(name, header=None, names=header, delim_whitespace=True)
    
    # Some .det files may be empty; skip them.
    if df.empty:
        continue
        
    # One row of per-frame lists per object id.
    dfg = df.groupby('id',as_index=False).agg(lambda x: x.tolist())
    
    new_df = pd.DataFrame()

    new_df['id'] = dfg['id']

    # Spread each 4-element list into <name>0..<name>3 scalar columns.
    for nn in new_names:
        new_df[nn + '0'],new_df[nn + '1'],new_df[nn + '2'],new_df[nn + '3'] = zip(*dfg[nn].map(onetofour))

    # new_df['rejected'] = zip(*dfg['rejected'].map(onebyone))
    # Keep only rejected0 — the flag is identical across the four frames.
    new_df.drop(['rejected1','rejected2','rejected3'], axis=1, inplace=True)
    
    
    training_input.append(new_df)
    
    # Strip the input directory prefix to reuse the original file name.
    pathOutput = OUT_DIR + name[len(DIR):]
    # Written space-delimited without a header row (the row index IS written).
    new_df.to_csv(pathOutput, header=None, sep=' ')

In [10]:
# Train a gradient-boosted classifier on the flattened per-object rows
# produced by the batch-processing cell (one row per id, 4 frames wide).
from sklearn.ensemble import GradientBoostingClassifier
try:
    # sklearn >= 0.18
    from sklearn.model_selection import train_test_split
except ImportError:
    # sklearn.cross_validation was removed in 0.20; keep as a fallback
    # for the old environment this notebook originally ran in.
    from sklearn.cross_validation import train_test_split

training_data = pd.concat(training_input)

# Target: the operator's rejected flag (per-object, kept as 'rejected0').
# .values replaces the deprecated/removed DataFrame.as_matrix().
Y = training_data['rejected0'].values

# Features: everything except the object id and the target itself.
# drop() without inplace leaves training_data intact for later cells.
X = training_data.drop(['id', 'rejected0'], axis=1).values

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=42)

# fit estimator
est = GradientBoostingClassifier(n_estimators=200, max_depth=3)
est.fit(X_train, y_train)

# predict class labels
pred = est.predict(X_test)

# score on test data (accuracy)
acc = est.score(X_test, y_test)
print('ACC: %.4f' % acc)

# predict class probabilities for the first test sample
est.predict_proba(X_test)[0]


ACC: 0.9765
Out[10]:
array([ 0.00202141,  0.99797859])

In [9]:
# Baseline: Gaussian naive Bayes on the same features, for comparison
# with the gradient-boosting model above.
from sklearn.naive_bayes import GaussianNB
try:
    # sklearn >= 0.18
    from sklearn.model_selection import train_test_split
except ImportError:
    # sklearn.cross_validation was removed in 0.20; fallback for the
    # old environment this notebook originally ran in.
    from sklearn.cross_validation import train_test_split

training_data = pd.concat(training_input)

# Dump the combined training table for offline inspection.
training_data.to_csv("test.csv", header=None, sep=' ')

# Target and features, as in the gradient-boosting cell.
# .values replaces the deprecated/removed DataFrame.as_matrix().
Y = training_data['rejected0'].values
X = training_data.drop(['id', 'rejected0'], axis=1).values

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=42)

gnb = GaussianNB()
y_pred = gnb.fit(X_train, y_train).predict(X_test)

# Parenthesized so this runs on both Python 2 and 3: the original
# 'print (expr).sum()' form prints the bare array on Python 3 and then
# fails with AttributeError (None.sum()).
print((y_test != y_pred).sum())  # number of misclassified samples
print((y_test == y_pred).sum())  # number of correctly classified samples


270
3130